# ------------------------------------------------------------------
# ' Figure 2 : what people talk about
# ' This file also checks the quality/reliability of the text codings
# ------------------------------------------------------------------
library(here)
library(data.table)
library(rio)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tidyr)
library(dplyr)
library(hrbrthemes)

# ---- load the matched manual codings ------------------------------------
coding_match <- fread(here('data','manual_coding','tess_coding_matched.csv'))

# Agreement checks (naive matches).
# n_category  : number of distinct categories recorded for a coding_id
# id_category : running row index within a coding_id, so that
#               id_category == 1 picks exactly one row per coding_id
coding_match[, n_category := uniqueN(category), by = "coding_id"]
coding_match[, id_category := seq(1, .N, by = 1), by = "coding_id"]

# Percentages in the trailing comments are the values noted by the author.
coding_match[, mean(matched)] # 63%
coding_match[id_category == 1, mean(p_matched == 1)] # 59.1% are hundred percent match
coding_match[id_category == 1, mean(p_matched > 0)] # 88.9% have at least one match
coding_match[id_category == 1, mean(p_matched)] # 74%: average p_matched over coding_ids

# ---- attach the codings to the text ids ----------------------------------
# Left join: every row of text_match is kept, matched or not.
text_match <- fread(here('data','manual_coding',"textcoding_idmatch.csv"))
text_raw <- merge(text_match, coding_match, by = "coding_id", all.x = TRUE)

# uid is split by character position: 1-4 -> year, 5-8 -> id
text_raw[, `:=`(
	year = as.integer(substr(as.character(uid), 1, 4)),
	id = as.integer(substr(as.character(uid), 5, 8))
)]

# ---- category crosswalk, used to reclassify the categories ---------------
# Categories in the crosswalk are title-cased before they are used as the
# join key below.
category_match <- fread(here('data','manual_coding','coding_category_match.csv'))
category_match[, category := stringr::str_to_title(category)]

# check discrepancy between the two sets of category labels
#setdiff(unique(coding_match[,category]),category_match[,category])
#setdiff(category_match[,category],unique(coding_match[,category]))

text_all <- merge(text_raw, category_match, by = "category", all.x = TRUE)

# Share of uids with at least one mention of each category, per year.
#
# Arguments
#   text_set      wide table with columns `uid`, `year` and one numeric count
#                 column per category (one row per uid/year, as returned by
#                 dcast(uid + year ~ <scheme>, fun.aggregate = length)).
#   coding_name   label written to the `coding_scheme` output column.
#   coder_name    label written to the `coder` output column.
#   source_labels data-source labels; they are assigned to the distinct values
#                 of `year` in ascending order, so there must be exactly one
#                 label per distinct year.
#
# Returns a data.table in long format with columns
#   category, tess2016, data_source, coding_scheme, coder
# where `tess2016` holds the proportion of rows with a count > 0 (the column
# name is kept as-is because callers rename it afterwards).
convert_to_mean = function(text_set,coding_name="tess23",coder_name="bk",
	source_labels=c("NC1997","TESS2010","TESS2016")){
	# Work on a plain data.frame so base indexing behaves the same whether the
	# input is a data.table, a tibble or a data.frame.
	text_set = as.data.frame(text_set)
	category_cols = setdiff(names(text_set),c("uid","year"))
	if (length(category_cols) == 0){
		stop("text_set has no category columns",call.=FALSE)
	}
	if (!all(vapply(text_set[category_cols],is.numeric,logical(1)))){
		stop("category columns must be numeric counts; ",
			"use fun.aggregate = length when reshaping",call.=FALSE)
	}

	# "Other/undefined" only counts when it is the sole category of a row: it is
	# cleared whenever a *different* category is also present. Comparing against
	# the other columns (rather than the row total) keeps rows whose only
	# category is "Other/undefined" recorded more than once.
	other_col = "Other/undefined"
	if (other_col %in% category_cols){
		substantive_n = rowSums(text_set[,setdiff(category_cols,other_col),drop=FALSE])
		redundant = which(text_set[[other_col]] > 0 & substantive_n > 0)
		text_set[[other_col]][redundant] = 0
	}

	# Labels are matched to years by position, so the counts have to agree.
	years = sort(unique(text_set$year))
	if (length(years) != length(source_labels)){
		stop("expected ",length(source_labels)," distinct years but found ",
			length(years),call.=FALSE)
	}

	# categories x years matrix of proportions with count > 0
	mean_out = do.call(cbind,lapply(years,function(y){
		in_year = which(text_set$year == y)
		colMeans(text_set[in_year,category_cols,drop=FALSE] > 0)
	}))
	N = nrow(mean_out)

	# as.vector() unrolls column by column, i.e. year by year
	data.table(category=rep(category_cols,times=length(years)),
		tess2016=as.vector(mean_out),
		data_source=rep(source_labels,each=N),
		coding_scheme=coding_name,
		coder=coder_name)
}

# keep only rows that carry a category
text_all = text_all[!is.na(category),]

# ---- all cases
# Reshape to one row per uid/year with a count column per category, once for
# each coding scheme. fun.aggregate = length is spelled out because dcast()
# only falls back to counting when it finds duplicate cells; without it a
# scheme with no duplicates would return the category strings instead of
# counts, which convert_to_mean() cannot summarise.
all_2016 <- text_all %>%
	dcast(uid + year ~ category, value.var ="category", fun.aggregate = length) %>%
	convert_to_mean("tess23","all")
all_2013 <- text_all %>%
	dcast(uid + year ~ Small_2013, value.var ="category", fun.aggregate = length) %>%
	convert_to_mean("Small_2013","all")
all_2014 <- text_all %>%
	dcast(uid + year ~ Brashears_2014, value.var ="category", fun.aggregate = length) %>%
	convert_to_mean("Brashears_2014","all")
all_2004 <- text_all %>%
	dcast(uid + year ~ BP_2004, value.var ="category", fun.aggregate = length) %>%
	convert_to_mean("BP_2004","all")

# Stack the four schemes; rename the proportion column by name rather than by
# position so a change in column order cannot rename the wrong column.
coding_output = rbind(all_2016, all_2013, all_2014, all_2004)
setnames(coding_output, "tess2016", "prop")

# ----- basic plot
# One panel per coarse coding scheme: horizontal grouped bars (one bar per
# data source) for every category, ordered by the TESS2016 proportion.
coding_type = c("BP_2004","Small_2013","Brashears_2014")
coding_type_name = c("Bearman and Parigi (2004) scale",
	"Small (2013) scale","Brashears (2014) scale")
source_cols = c('NC1997','TESS2010','TESS2016')

pdf(here('results','figures','figure_what_to_talk.pdf'),paper='special',width=9,height=5)
# finally = dev.off() closes the PDF device even if a panel fails to draw
tryCatch({
	layout(matrix(seq_along(coding_type),1,length(coding_type)))
	par(mar=c(5,8,4,1)) # wide left margin for the category labels
	for (k in seq_along(coding_type)){
		ctype = coding_type[k]
		# Reshape on a plain data.frame: the indexing below must not depend on
		# whether the table keeps its data.table class through dplyr/tidyr
		# (a data.table's tab[,1] is a one-column table, not a vector).
		tab = as.data.frame(coding_output) %>%
			filter(coding_scheme == ctype) %>%
			spread(key="data_source",value="prop")
		tab = tab[order(tab[["TESS2016"]]),c('category',source_cols)]

		# rows = data sources, columns = categories
		heights = t(as.matrix(tab[,source_cols]))
		b <- barplot(heights,beside=TRUE,
			density=c(50,100,50) ,
			angle=c(11,0,90),
			horiz=TRUE,col=c("gray","skyblue","pink"),axisnames=FALSE,
			legend.text=source_cols,args.legend=list(bty="n",x="bottomright"),xlim=c(0,0.5),
			main=coding_type_name[k])
		# category labels are placed at the first bar of each group
		axis(2,b[1,],tab[["category"]],las=1,cex.axis=0.9)
		abline(v=c(0.1,0.2,0.3,0.4,0.5),col="black",lwd=1,lty=2)
	}
}, finally = dev.off())

